#set testing to FALSE to run full gene expression networks and to TRUE
#to run a subset.
#testing = TRUE
testing = FALSE
is.interactive = FALSE
#is.interactive = TRUE
net.type = "signed"
#net.type = "unsigned"

tissues = c("Islet", "Liver", "Adipose", "SkeletalMuscle")

args <- commandArgs(trailingOnly=T)
tissue.idx <- as.numeric(args[1])
clust.type <- args[2]

tissue.type <- tissues[tissue.idx]

if(is.na(tissue.type)){
  tissue.type = "Islet"
}

library(here)
## here() starts at /Users/atyler/Documents/Projects/Cube_Hackathon_P1
results.dir <- here("Results", paste0(clust.type, "_Clusters"), tissue.type)

The purpose of this workflow is to cluster bulk tissue gene expression from the DO mice with WGCNA and CoExpNets. Downstream analyses will compare the clustering from these two methods.

This workflow cluster transcripts separately in the specified tissue, and runs Islet by default. Other possible tissues are Adipose, Liver, and SkeletalMuscle.

all.fun <- list.files(here("Code"), full.names = TRUE, pattern = ".R")
for(i in 1:length(all.fun)){source(all.fun[i])}
all.packages <- c("pheatmap", "gprofiler2", "CoExpNets", "qtl2", "WGCNA")
load_libraries(all.packages)
exp.file <- here("Data", "RDS_datasets_tissues", paste0(tissue.type, ".RDS"))
tissue.exp <- readRDS(exp.file)

Filter Expression

Filter the expression matrices to include only genes that have at least a minimum amount of expression.

Here we use the raw expression matrix to select transcripts from the rank Z normalized expression matrix.

min.mean = 10

tissue.trans <- which(colMeans(tissue.exp$data$raw) > min.mean)
tissue.expr <- tissue.exp$data$rz[,tissue.trans]
tissue.covar <- tissue.exp$covar.matrix
adj.expr <- adjust(tissue.expr, tissue.covar)

Also collect clinical traits.

pheno <- read.csv(here("Data", "DO_clinical_phenotypes.csv"), stringsAsFactors = FALSE, row.names = 1)
#subset to numeric traits
num.pheno <- apply(pheno[,11:30], 2, as.numeric)
rownames(num.pheno) <- rownames(pheno)
adj.pheno <- adjust(num.pheno, tissue.covar)
gene.table <- tissue.exp$annot.mrna

Cluster gene expression using CoExpNets or WGCNA

CoExpNets is WGCNA plus a k-means clustering step that redistributes genes across modules. It generally identifies clusters with more even size distribution and more specific enrichments than WGCNA.

if(clust.type == "CoExpNets"){
    #Clustering takes a long time
    #set fullAnnotation to F because function defaults to human.
    #we will do this ourselves by hand.
    clust.results <- here("Results", "CoExpNet_Clusters")
    if(!file.exists(clust.results)){dir.create(clust.results)}
    if(!file.exists(results.dir)){dir.create(results.dir)}
    net.file <- get.files(results.dir, want = c("net", "rds"), dont.want = "pdf", 
    full.names = TRUE, ignore.case = FALSE)
    if(length(net.file) == 0){
    net.file = CoExpNets::getDownstreamNetwork(tissue=tissue.type,
            net.type = net.type, debug=testing, expr.data = adj.expr,
            job.path= results.dir, save.plots = TRUE, fullAnnotation = F)
    }
    clust.net <- readRDS(net.file)
}
if(clust.type == "WGCNA"){
    wgcna.results <- here("Results", "WGCNA_Clusters")
    if(!file.exists(wgcna.results)){dir.create(wgcna.results)}
    if(!file.exists(results.dir)){dir.create(results.dir)}
    wgcna.net.file <- file.path(results.dir, paste0("net.RDS"))
    if(!file.exists(wgcna.net.file)){
    enableWGCNAThreads()

    powers = c(c(1:10), seq(from = 12, to=20, by=2))
    sft = pickSoftThreshold(adj.expr, powerVector = powers, verbose = 5)
    use.power <- sft$powerEstimate
    if(is.na(use.power)){use.power = 6}

    clust.net = blockwiseModules(adj.expr, power = use.power, TOMType = net.type, 
    minModuleSize = 10, reassignThreshold = 0, mergeCutHeight = 0.25, 
    numericLabels = TRUE, pamRespectsDendro = FALSE,saveTOMs = FALSE)

    saveRDS(clust.net, wgcna.net.file)
    }else{
    clust.net <- readRDS(wgcna.net.file)
    }

    mergedColors = labels2colors(clust.net$colors)
    wgcna.modules <- clust.net$colors

    pdf(file.path(results.dir, "Modules.pdf"))
    for(i in 1:length(clust.net$dendrograms)){
        plotDendroAndColors(clust.net$dendrograms[[i]], 
    mergedColors[clust.net$blockGenes[[i]]], "Module colors", 
    dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05)
        }
    dev.off()
}
## quartz_off_screen 
##                 2

Characterize Modules

In each tissue we characterized the modules using gprofiler2. We looked for enrichment in all GO domains, as well as KEGG and REACTOME pathways.

The following heatmaps show enriched terms for each set of modules. Shown are the top 10 most significant terms with fewer than 500 genes. The islet enrichment plot only shows the top 5 terms per group, since there are so many modules in this tissue.

module_genes <- function(net.obj){
  modules <- gsub("ME", "", colnames(net.obj$MEs))
  if(clust.type == "CoExpNets"){
    moduleV <- net.obj$moduleColors
  }
  if(clust.type == "WGCNA"){
    moduleV <- net.obj$colors
  }
  module.genes <- lapply(modules, function(x) names(moduleV)[which(moduleV == x)])
  names(module.genes) <- modules
  return(module.genes)
}

characterize_module <- function(enrich.table, n.terms = 10, 
pval_thresh = 0.05, max.term.size = NULL, order.by = "p_value",
decreasing = FALSE){

  low.info.words <- c("to", "of", "the", "process", "or", "by", "in", "and")

  if(length(enrich.table) == 0){return("none")}
  
  if(length(enrich.table) > 1){
    enrichments <- enrich.table[[1]]
    if(length(max.term.size) > 0){
      term.locale <- which(enrichments[,"term_size"] <= max.term.size)
    }else{
      term.locale <- 1:nrow(enrichments)
    }
    if(length(pval_thresh) > 0){
      sig.locale <- which(enrichments[,"p_value"] <= pval_thresh)
    }else{
      sig.locale <- 1:nrow(enrichments)
    }
    take.locale <- intersect(term.locale, sig.locale)
    all.terms <- enrichments[take.locale,"term_name"]
    if(!is.null(order.by)){
      ordered.terms <- enrichments[order(enrichments[,order.by], decreasing = decreasing),]
    }else{
      ordered.terms <- enrichments
    }
    all.terms <- ordered.terms[,"term_name"]
    term.words <- unique(unlist(strsplit(all.terms, " ")))
    pruned.words <- setdiff(term.words, low.info.words)
    top.words <- pruned.words[1:n.terms]
    mod.description <- paste(top.words, collapse = "-")
    return(mod.description)
  }
}

Module Enrichment

module.genes <- module_genes(clust.net)
saveRDS(module.genes, file.path(results.dir, "Module_Membership.RDS"))
enrich.file <- file.path(results.dir, "enrich.RDS")
if(!file.exists(enrich.file)){
  enrich <- lapply(module.genes, function(x) gost(x, organism = "mmusculus",
  sources = c("GO", "KEGG", "REACTOME")))
  saveRDS(enrich, enrich.file)
}else{
  enrich <- readRDS(enrich.file)
}
mod.names <- sapply(enrich, 
function(x) characterize_module(x, n.terms = 5, order.by = "p_value", max.term.size = 500))

jpeg(file.path(results.dir, "Enrichment.jpg"), 
width = 9, height = 32, units = "in", res = 300)
plot.enrichment.group(enrich, max.term.size = 500, plot.label = 
paste(tissue.type, "Modules"), n.terms = 5)
dev.off()
## quartz_off_screen 
##                 3
#plot.enrichment.group(enrich, max.term.size = 500, plot.label = 
#paste(tissue.type, "Modules"), n.terms = 5)

Trait-Module Correlation

We looked at the correlations between each module eigengene and each trait.

mod.eig <- as.matrix(clust.net$MEs)
rownames(mod.eig) <- rownames(adj.expr)
write.table(mod.eig, file.path(results.dir, "Module_Eigengenes.RDS"))

#align the phenotype and eigengene matrices
mod.pheno <- get.xz(mod.eig, adj.pheno)
cor.mat <- matrix(NA, nrow = ncol(mod.pheno$X), ncol = ncol(mod.pheno$Z))
rownames(cor.mat) <- paste0("ME", colnames(mod.pheno$X))
colnames(cor.mat) <- colnames(mod.pheno$Z)

for(i in 1:ncol(mod.pheno$X)){
  for(j in 1:ncol(mod.pheno$Z)){
    #plot.with.model(mod.pheno$X[,i], mod.pheno$Z[,j], report = "cor.test")
    mod.trait.cor <- cor(mod.pheno$X[,i], mod.pheno$Z[,j])
    cor.mat[i,j] <- mod.trait.cor
  }
}

jpeg(file.path(results.dir, "Trait_Module_Correlation.jpg"), 
width = 8, height = 8, units = "in", res = 300)
pheatmap(t(cor.mat))
dev.off()
## quartz_off_screen 
##                 3
#pheatmap(t(cor.mat))

Module mapping

Map each module eigengene. Compare these to the mapping of the WGCNA modules in Keller et al. 2018. They are pretty grassy, except for islet module skyblue3, which has a whopping QTL. I think this module is probably the same as Plum1 in Keller 2018.

#load genotype data
all.var <- ls()
data.loaded <- as.logical(length(which(all.var == "dataset.DO.Cube.Adipose")))
if(!data.loaded){
  load(here("Data", "dataset.DO.CUBE.multissue.RData"))
}
scan.file <- file.path(results.dir, "qtl.scan.RDS")
if(!file.exists(scan.file)){
 scans <- scan1(genoprobs, mod.eig)
 saveRDS(scans, scan.file)
}else{
  scans  <- readRDS(scan.file)
}

mod.dist <- dist(t(scans))
mod.order <- hclust(mod.dist)$order
thresh <- scans[,mod.order]
thresh[which(thresh > 8)] <- 8
jpeg(file.path(results.dir, "Module.Scans.jpg"), width = 12, height = 6, units = "in", 
res = 300)
par(xpd = TRUE)
multilod.plot(thresh, map = map, lod.thresh = 5, border.lwd = 1, 
row.names = colnames(mod.eig)[mod.order], row.name.shift = -nrow(thresh)*0.011, 
row.text.cex = 0.7, mar = c(2, 10, 2, 0))
## Loading required package: grid
par(xpd = FALSE)
dev.off()
## quartz_off_screen 
##                 2
#plot(islet.scans, "MEskyblue3", map = map)
#plot(islet.scans, "MEyellowgreen", map = map)